## ------------------------------------------------------------------------- ##
## Aggregates the household level DHS data for each survey cluster.
## India 2015-16 DHS.
## ------------------------------------------------------------------------- ##

# set working directory as appropriate
wd <- "."
setwd(wd)

library(foreign)
library(rgdal)

# DHS survey dataset files
# These files can be downloaded from DHS website after registering
# and requesting access:
# https://dhsprogram.com/data/dataset/India_Standard-DHS_2015.cfm?flag=1

dhs_household_survey_file <- "dhs_files/IAHR74DT/IAHR74FL.dta"
geo_folder <- "dhs_files/IAGE71FL"
geo_file <- "IAGE71FL"

# data file
datas <- read.dta(dhs_household_survey_file)

# geo shape files for DHS clusters
geo_datas <- readOGR(dsn = geo_folder, layer = geo_file)

# summary statistics
summary(datas$hv015) # interview status
summary(datas$hv021) # PSU
summary(datas$hv206) # electricity
summary(datas$hv207) # radio
summary(datas$hv208) # TV
summary(datas$hv209) # refrigerator
summary(datas$hv243a) # mobile phone
summary(datas$hv270) # wealth index
summary(datas$hv271) # wealth index factor score

## aggregate the data we need per cluster level
cluster_data <- data.frame(cluster = as.character(geo_datas@data$DHSCLUST))
cluster_data$num_households <- NA
cluster_data$has_electricity <- NA
cluster_data$has_radio <- NA
cluster_data$has_TV <- NA
cluster_data$has_refrigerator <- NA
cluster_data$has_mobile <- NA
cluster_data$cluster_mean_wealth_index <- NA

nclust <- nrow(cluster_data)
for (i in 1:nclust) {
  cat("cluster:",i,"of:",nclust,"                  \r")

  # the cluster
  clust <- as.character(cluster_data$cluster[i])
  I <- as.character(datas$hv001) == clust
  
  # number households
  num_households <- sum(I)
  # print(paste("Cluster:",clust,"total households:", num_households))
  cluster_data$num_households[i] <- num_households
  
  # skip clusters with no households
  if (cluster_data$num_households[i] == 0) { 
    print(paste("Cluster:",clust,"total households:", num_households,"; Skipping cluster."))
    next 
  }
  
  ## asset ownership
  cluster_data$has_electricity[i] <- sum(datas$hv206[I] == "yes")
  cluster_data$has_radio[i] <- sum(datas$hv207[I] == "yes")
  cluster_data$has_TV[i] <- sum(datas$hv208[I] == "yes")
  cluster_data$has_refrigerator[i] <- sum(datas$hv209[I] == "yes")
  cluster_data$has_mobile[i] <- sum(datas$hv243a[I] == "yes")

  cluster_data$cluster_mean_wealth_index[i] <- mean(datas$hv271[I])
}

# compute fraction of households with different assets
assets <- c("has_electricity","has_radio","has_TV","has_refrigerator",
            "has_mobile")

for (item in assets) {
  item_frac <- paste(item,"frac",sep="_")
  cluster_data[item_frac] <- cluster_data[,item]/cluster_data$num_households
}

# merge with cluster geographic information
geo_sub <- geo_datas@data
geo_sub$DHSCLUST <- as.character(geo_sub$DHSCLUST)
geo_sub <- merge(x = geo_sub, by.x = "DHSCLUST", 
                 y = cluster_data, by.y = "cluster", all.x = TRUE)

# create the regional indicator variables
dhs_regions <- levels(as.factor(geo_sub$ADM1NAME))
for (reg in dhs_regions) {
  reg_dummy_var <- paste("region_is",reg,sep="_")
  geo_sub[reg_dummy_var] <- ifelse(as.character(geo_sub$ADM1NAME) == reg, 1, 0)
}

# save
write.csv(geo_sub,"India_dhs_dataset.csv", row.names = FALSE)